/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.avro.xml; import java.io.File; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; import java.util.Map; import javax.xml.namespace.QName; import org.apache.avro.Schema; import org.apache.avro.Schema.Type; import org.apache.ws.commons.schema.XmlSchemaAll; import org.apache.ws.commons.schema.XmlSchemaAnnotation; import org.apache.ws.commons.schema.XmlSchemaAnnotationItem; import org.apache.ws.commons.schema.XmlSchemaAny; import org.apache.ws.commons.schema.XmlSchemaAnyAttribute; import org.apache.ws.commons.schema.XmlSchemaAttribute; import org.apache.ws.commons.schema.XmlSchemaChoice; import org.apache.ws.commons.schema.XmlSchemaDocumentation; import org.apache.ws.commons.schema.XmlSchemaElement; import org.apache.ws.commons.schema.XmlSchemaSequence; import org.apache.ws.commons.schema.XmlSchemaUse; import org.apache.ws.commons.schema.constants.Constants; import org.apache.ws.commons.schema.walker.XmlSchemaAttrInfo; import org.apache.ws.commons.schema.walker.XmlSchemaTypeInfo; import org.apache.ws.commons.schema.walker.XmlSchemaVisitor; import org.codehaus.jackson.node.ArrayNode; import org.codehaus.jackson.node.JsonNodeFactory; import org.codehaus.jackson.node.ObjectNode; import org.w3c.dom.NodeList; /** * Generates an Avro schema based on the walked XML Schema. */ final class AvroSchemaGenerator implements XmlSchemaVisitor { private Schema root; private final List<URL> schemaUrls; private final List<File> schemaFiles; private final String baseUri; private final ArrayList<StackEntry> stack; private final Map<QName, Schema> schemasByElement; private final Map<QName, List<Schema>> substitutionGroups; private final Map<QName, List<Schema>> fieldsByElement; private final Map<QName, List<AttributeEntry>> attributesByElement; private static class StackEntry { final QName elementQName; final boolean isSubstitutionGroup; public StackEntry( final QName elementQName, final boolean isSubstitutionGroup) { this.elementQName = elementQName; this.isSubstitutionGroup = isSubstitutionGroup; } } private static class AttributeEntry { private final Schema.Field schemaField; private final boolean isNonOptionalIdField; AttributeEntry(Schema.Field field, boolean isNonOptionalIdField) { this.schemaField = field; this.isNonOptionalIdField = isNonOptionalIdField; } Schema.Field getField() { return schemaField; } boolean isNonOptionalIdField() { return isNonOptionalIdField; } } AvroSchemaGenerator( String baseUri, List<URL> schemaUrls, List<File> schemaFiles) { this.baseUri = baseUri; this.schemaUrls = schemaUrls; this.schemaFiles = schemaFiles; root = null; stack = new ArrayList<StackEntry>(); schemasByElement = new HashMap<QName, Schema>(); substitutionGroups = new HashMap<QName, List<Schema>>(); fieldsByElement = new HashMap<QName, List<Schema>>(); attributesByElement = new HashMap<QName, List<AttributeEntry>>(); } /** * Clears the internal state of the {@link AvroSchemaGenerator} * so a new Avro {@link Schema} can be generated from a new * {@link org.apache.ws.commons.schema.XmlSchema}. */ void clear() { root = null; stack.clear(); schemasByElement.clear(); substitutionGroups.clear(); fieldsByElement.clear(); attributesByElement.clear(); } /** * The generated {@link Schema}, or <code>null</code> if none. * * @return The generated {@link Schema}, or <code>null</code> if none. */ Schema getSchema() { return root; } /** * If this element was not added previously, registers a new schema for it. * If this element was added previously, and it is not part of a substitution * group, add it to its parent. * * @see XmlSchemaVisitor#onEnterElement(XmlSchemaElement, XmlSchemaTypeInfo, boolean) */ @Override public void onEnterElement( final XmlSchemaElement element, final XmlSchemaTypeInfo typeInfo, final boolean previouslyVisited) { final QName elemQName = element.getQName(); final StackEntry substGrp = getSubstitutionGroup(); if (previouslyVisited) { if (substGrp == null) { /* This element is not part of a substitution group, so * it is simply a child of its parent. Add it as such. * * If this were a member of a substitution group, then it * would have been added to that substitution group the first * time it was encountered. Once the substitution group has * been fully processed, all of its elements will be added * to the proper parent(s) at once. Likewise, nothing needs * to be done here. */ final Schema schema = schemasByElement.get(elemQName); if (schema == null) { throw new IllegalStateException( "Element \"" + element.getQName() + "\" was previously visited, but has no schema."); } addSchemaToParent(schema); } return; } /* If this element is abstract, it is not * a member of the substitution group. */ if ((substGrp != null) && element.isAbstract()) { return; } // If documentation is available, makes it the record's documentation. final String documentation = getDocumentationFor( element.getAnnotation() ); // Create the record. Schema record = null; String avroNamespace = null; try { avroNamespace = Utils.getAvroNamespaceFor( elemQName.getNamespaceURI() ); } catch (URISyntaxException e) { throw new IllegalArgumentException( "Element \"" + elemQName + "\" has an invalid namespace of \"" + elemQName.getNamespaceURI() + "\"", e); } record = Schema.createRecord( elemQName.getLocalPart(), documentation, avroNamespace, false); schemasByElement.put(elemQName, record); } /** * If this element was not visited previously, retrieves all of the fields * associated with the record corresponding to the element, and adds them. * * @see XmlSchemaVisitor#onExitElement(XmlSchemaElement, XmlSchemaTypeInfo, boolean) */ @Override public void onExitElement( XmlSchemaElement element, XmlSchemaTypeInfo typeInfo, boolean previouslyVisited) { if (previouslyVisited) { /* No element was added to the stack, so no element should be removed * from the stack. This element either has been processed already or * will be processed later. */ return; } else if ((getSubstitutionGroup() != null) && element.isAbstract()) { /* This element is part of a substitution group and was declared * abstract, so it will not be a valid child element. */ return; } final StackEntry entry = pop(element.getQName(), false); Schema record = schemasByElement.get(entry.elementQName); if (record == null) { throw new IllegalStateException( "No schema found for element \"" + entry.elementQName + "\"."); } else if (record.getType().equals(Schema.Type.MAP)) { record = record.getValueType(); } final List<AttributeEntry> fields = attributesByElement.get(entry.elementQName); ArrayList<Schema.Field> schemaFields = null; if (fields == null) { schemaFields = new ArrayList<Schema.Field>(1); } else { schemaFields = new ArrayList<Schema.Field>( fields.size() ); for (AttributeEntry attrEntry : fields) { schemaFields.add( attrEntry.getField() ); } } final List<Schema> children = fieldsByElement.get(entry.elementQName); /* If there are multiple maps in the children, merge * them together under one MAP of UNION of children. */ if ((children != null) && !children.isEmpty()) { int mapIndex = -1; ArrayList<Schema> mapUnion = null; ArrayList<Integer> indicesToRemove = null; for (int i = 0; i < children.size(); ++i) { Schema currSchema = children.get(i); if (currSchema.getType().equals(Schema.Type.MAP)) { if (mapIndex < 0) { // This is the first map. mapIndex = i; } else if (mapUnion == null) { // This is the second map. mapUnion = new ArrayList<Schema>(); final Schema mapSchema = children.get(mapIndex); mapUnion.add( mapSchema.getValueType() ); mapUnion.add( currSchema.getValueType() ); indicesToRemove = new ArrayList<Integer>(); indicesToRemove.add(i); } else { // These are maps 3+. mapUnion.add( currSchema.getValueType() ); indicesToRemove.add(i); } } } if (mapUnion != null) { // 1. Create a union of all MAP children. children.set( mapIndex, Schema.createMap( Schema.createUnion(mapUnion) )); // 2. Remove all other indices with MAPs in them. ListIterator<Integer> iter = indicesToRemove.listIterator(indicesToRemove.size()); while ( iter.hasPrevious() ) { children.remove( iter.previous().intValue() ); } } // Now, remove duplicate children. final HashSet<String> duplicates = new HashSet<String>(); if (indicesToRemove != null) { indicesToRemove.clear(); } for (int childIndex = 0; childIndex < children.size(); ++childIndex) { Schema child = children.get(childIndex); if ( duplicates.contains( child.getFullName() ) ) { if (indicesToRemove == null) { indicesToRemove = new ArrayList<Integer>(); } indicesToRemove.add(childIndex); } else { duplicates.add( child.getFullName() ); } } if ((indicesToRemove != null) && !indicesToRemove.isEmpty()) { ListIterator<Integer> iter = indicesToRemove.listIterator(indicesToRemove.size()); while ( iter.hasPrevious() ) { children.remove( iter.previous().intValue() ); } } } if ((children != null) && !children.isEmpty() && (typeInfo != null) && (typeInfo.getUserRecognizedType() != null)) { throw new IllegalStateException( "Element \"" + entry.elementQName + "\" has both a type (" + typeInfo.getUserRecognizedType() + ") and " + children.size() + " child elements."); } else if ((children != null) && !children.isEmpty()) { boolean isMixedType = false; if (typeInfo != null) { isMixedType = typeInfo.isMixed(); } if (isMixedType) { boolean foundString = false; for (Schema child : children) { if ( child.getType().equals(Schema.Type.STRING) ) { foundString = true; break; } } if (!foundString) { children.add( Schema.create(Schema.Type.STRING) ); } } final Schema schema = Schema.createArray( Schema.createUnion(children) ); final Schema.Field field = new Schema.Field( entry.elementQName.getLocalPart(), schema, "Children of " + entry.elementQName, null); schemaFields.add(field); } else if ((typeInfo != null) && (typeInfo.getType().equals(XmlSchemaTypeInfo.Type.LIST) || typeInfo.getType().equals(XmlSchemaTypeInfo.Type.UNION) || typeInfo.isMixed() || (typeInfo.getUserRecognizedType() != null))) { final Schema childSchema = Utils.getAvroSchemaFor( typeInfo, element.getQName(), element.isNillable()); final Schema.Field field = new Schema.Field( entry.elementQName.getLocalPart(), childSchema, "Simple type " + typeInfo.getUserRecognizedType(), null); schemaFields.add(field); } else if (((children == null) || children.isEmpty()) && ((typeInfo == null) || (typeInfo.getUserRecognizedType() == null))) { // This element has no children. Set a null placeholder. final Schema.Field field = new Schema.Field( entry.elementQName.getLocalPart(), Schema.create(Type.NULL), "This element contains no attributes and no children.", null); schemaFields.add(field); } record.setFields(schemaFields); } /** * Processes the attribute of the provided element, * adding it as a field to the corresponding record. * * @see XmlSchemaVisitor#onVisitAttribute(XmlSchemaElement, XmlSchemaAttrInfo) */ @Override public void onVisitAttribute( XmlSchemaElement element, XmlSchemaAttrInfo attrInfo) { if ((getSubstitutionGroup() != null) && element.isAbstract()) { // Abstract elements are ignored. return; } final XmlSchemaAttribute attribute = attrInfo.getAttribute(); final XmlSchemaTypeInfo attributeType = attrInfo.getType(); if ( attribute.getUse().equals(XmlSchemaUse.PROHIBITED) ) { // This attribute is prohibited and cannot be part of the record. return; } final QName elemQName = element.getQName(); final QName attrQName = attribute.getQName(); final String documentation = getDocumentationFor( attribute.getAnnotation() ); boolean isOptional = false; // Optional types are unions of the real type and null. if (attribute.getUse().equals(XmlSchemaUse.OPTIONAL) && (attribute.getDefaultValue() == null) && (attribute.getFixedValue() == null)) { isOptional = true; } boolean isIdField = false; if ((attributeType.getUserRecognizedType() != null) && attributeType.getUserRecognizedType().equals(Constants.XSD_ID)) { isIdField = true; } Schema attrSchema = Utils.getAvroSchemaFor( attributeType, attribute.getQName(), isOptional); final Schema.Field attr = new Schema.Field( attrQName.getLocalPart(), attrSchema, documentation, null); List<AttributeEntry> attrs = attributesByElement.get(elemQName); if (attrs == null) { attrs = new ArrayList<AttributeEntry>(); attributesByElement.put(elemQName, attrs); } attrs.add( new AttributeEntry(attr, isIdField && !isOptional) ); } @Override public void onEndAttributes( XmlSchemaElement element, XmlSchemaTypeInfo elemTypeInfo) { final QName elemQName = element.getQName(); final StackEntry substGrp = getSubstitutionGroup(); if ((substGrp != null) && element.isAbstract()) { return; } final List<AttributeEntry> fields = attributesByElement.get(elemQName); Schema record = schemasByElement.get(elemQName); if (record == null) { throw new IllegalStateException( "No schema found for element \"" + elemQName + "\"."); } /* If this RECORD contains exactly one non-optional ID attribute, it is * better served as a MAP. However, the root element of a document cannot * be a map; it would have no siblings. */ if (!stack.isEmpty() && ((stack.size() > 1) || !stack.get(0).isSubstitutionGroup) && isMap(fields)) { record = Schema.createMap(record); schemasByElement.put(elemQName, record); } if (substGrp != null) { /* This element is part of a substitution group. * It will be added to its parent(s) later. */ List<Schema> substitutionSchemas = substitutionGroups.get(substGrp.elementQName); if (substitutionSchemas == null) { substitutionSchemas = new ArrayList<Schema>(); substitutionGroups.put(substGrp.elementQName, substitutionSchemas); } substitutionSchemas.add(record); } else if ( stack.isEmpty() ) { // This is the root element! root = record; addXmlSchemasListToRoot( element.getQName() ); } else { /* This is not part of a substitution group, * and not the root. Add it to its parent. */ addSchemaToParent(record); } stack.add( new StackEntry(elemQName, false) ); } /** * Adds a new stack entry for this substitution group. * * @see XmlSchemaVisitor#onEnterSubstitutionGroup(XmlSchemaElement) */ @Override public void onEnterSubstitutionGroup(XmlSchemaElement base) { stack.add( new StackEntry(base.getQName(), true) ); } /** * Retrieves all of the members of this substitution * group and adds them to the parent as children. * * @see XmlSchemaVisitor#onExitSubstitutionGroup(XmlSchemaElement) */ @Override public void onExitSubstitutionGroup(XmlSchemaElement base) { final StackEntry entry = pop(base.getQName(), true); final List<Schema> substitutes = substitutionGroups.get(entry.elementQName); if ((substitutes == null) || substitutes.isEmpty()) { /* This happens when an abstract element can only * be substituted by other abstract elements. */ return; } if ( stack.isEmpty() ) { // The root node in the stack is part of a substitution group. root = Schema.createUnion(substitutes); addXmlSchemasListToRoot( base.getQName() ); } else { // The substitution group is part of a higher group. StackEntry parent = getParentElement(); List<Schema> siblings = fieldsByElement.get(parent.elementQName); if (siblings == null) { siblings = new ArrayList<Schema>( substitutes.size() ); } siblings.addAll(substitutes); } } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onEnterAllGroup(XmlSchemaAll) */ @Override public void onEnterAllGroup(XmlSchemaAll all) { } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onExitAllGroup(XmlSchemaAll) */ @Override public void onExitAllGroup(XmlSchemaAll all) { } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onEnterChoiceGroup(XmlSchemaChoice) */ @Override public void onEnterChoiceGroup(XmlSchemaChoice choice) { } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onExitChoiceGroup(XmlSchemaChoice) */ @Override public void onExitChoiceGroup(XmlSchemaChoice choice) { } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onEnterSequenceGroup(XmlSchemaSequence) */ @Override public void onEnterSequenceGroup(final XmlSchemaSequence seq) { } /** * Avro schemas do not handle different group * types differently. This is a no-op. * * @see XmlSchemaVisitor#onExitSequenceGroup(XmlSchemaSequence) */ @Override public void onExitSequenceGroup(final XmlSchemaSequence seq) { } /** * Avro schemas do not have support for an * "anything" type. This method is a no-op. * * @see XmlSchemaVisitor#onVisitAny(XmlSchemaAny) */ @Override public void onVisitAny(XmlSchemaAny any) { } /** * Avro schemas do not have support for an * "anything" type. This method is a no-op. * * @see XmlSchemaVisitor#onVisitAnyAttribute(XmlSchemaElement, XmlSchemaAnyAttribute) */ @Override public void onVisitAnyAttribute(final XmlSchemaElement element, final XmlSchemaAnyAttribute anyAttr) { } /** * Retrieves the <code>StackEntry</code> representing the parent element. * Traverses through as many substitution groups as necessary to find it. * * @return The <code>StackEntry</code> representing the parent element. * @exception IllegalStateException if there is no parent element. */ private StackEntry getParentElement() { ListIterator<StackEntry> iterator = stack.listIterator( stack.size() ); while ( iterator.hasPrevious() ) { StackEntry entry = iterator.previous(); if (!entry.isSubstitutionGroup) { return entry; } } throw new IllegalStateException("No parent element available in stack."); } /** * If we are processing a substitution group, returns the * <code>StackEntry</code> representing that substitution * group. Otherwise, returns <code>null</code>. * * @return A <code>StackEntry</code> for the represented * substitution group, or <code>null</code> if none. */ private StackEntry getSubstitutionGroup() { final ListIterator<StackEntry> iterator = stack.listIterator(stack.size()); if (iterator.hasPrevious()) { StackEntry prev = iterator.previous(); if (prev.isSubstitutionGroup) { return prev; } } return null; } private void addSchemaToParent(final Schema schema) { final StackEntry parent = getParentElement(); List<Schema> siblings = fieldsByElement.get(parent.elementQName); if (siblings == null) { siblings = new ArrayList<Schema>(); fieldsByElement.put(parent.elementQName, siblings); } siblings.add(schema); } private StackEntry pop(QName entryQName, boolean isSubstGroup) { if ( stack.isEmpty() ) { throw new IllegalStateException( "Attempted to pop " + getStackEntryInfo(entryQName, isSubstGroup) + " off of an empty stack."); } final StackEntry entry = stack.remove(stack.size() - 1); if (!entry.elementQName.equals(entryQName) || (entry.isSubstitutionGroup != isSubstGroup)) { throw new IllegalStateException( "Attempted to pop " + getStackEntryInfo(entryQName, isSubstGroup) + " but found " + getStackEntryInfo(entry.elementQName, entry.isSubstitutionGroup)); } return entry; } private void addXmlSchemasListToRoot(QName rootTagQName) { if (((schemaUrls == null) || schemaUrls.isEmpty()) && ((schemaFiles == null) || schemaFiles.isEmpty()) && ((baseUri == null) || !baseUri.isEmpty())) { return; } final ObjectNode schemasNode = JsonNodeFactory.instance.objectNode(); if ((schemaUrls != null) && !schemaUrls.isEmpty()) { final ArrayNode urlArrayNode = JsonNodeFactory.instance.arrayNode(); for (URL schemaUrl : schemaUrls) { urlArrayNode.add( schemaUrl.toString() ); } schemasNode.put("urls", urlArrayNode); } if ((schemaFiles != null) && !schemaFiles.isEmpty()) { final ArrayNode fileArrayNode = JsonNodeFactory.instance.arrayNode(); for (File schemaFile : schemaFiles) { fileArrayNode.add( schemaFile.getAbsolutePath() ); } schemasNode.put("files", fileArrayNode); } if ((baseUri != null) && !baseUri.isEmpty()) { schemasNode.put("baseUri", baseUri); } final ObjectNode rootTagNode = JsonNodeFactory.instance.objectNode(); rootTagNode.put("namespace", rootTagQName.getNamespaceURI()); rootTagNode.put("localPart", rootTagQName.getLocalPart()); schemasNode.put("rootTag", rootTagNode); if ( root.getType().equals(Schema.Type.RECORD) ) { root.addProp("xmlSchemas", schemasNode); } else if ( root.getType().equals(Schema.Type.UNION) ) { if ((root.getTypes() == null) || root.getTypes().isEmpty()) { throw new IllegalStateException( "Root is a substitution group with no children!"); } final Schema firstElem = root.getTypes().get(0); if ( !firstElem.getType().equals(Schema.Type.RECORD) ) { throw new IllegalStateException( "Root is a substitution group with a first element of type " + firstElem.getType()); } firstElem.addProp("xmlSchemas", schemasNode); } else { throw new IllegalStateException( "Document root is neither a RECORD nor a UNION."); } } private static boolean isMap(final List<AttributeEntry> attributes) { int nonOptionalIdFieldCount = 0; if (attributes != null) { for (AttributeEntry attribute : attributes) { if ( attribute.isNonOptionalIdField() ) { ++nonOptionalIdFieldCount; } } } return (nonOptionalIdFieldCount == 1); } private static String getStackEntryInfo( QName entryQName, boolean isSubstGroup) { return "\"" + entryQName + "\" (Substitution Group? " + isSubstGroup + ")"; } private static String getDocumentationFor(XmlSchemaAnnotation annotation) { if ((annotation != null) && (annotation.getItems() != null) && !annotation.getItems().isEmpty()) { StringBuilder docs = new StringBuilder(); for (XmlSchemaAnnotationItem item : annotation.getItems()) { if (item instanceof XmlSchemaDocumentation) { final NodeList docNodes = ((XmlSchemaDocumentation) item).getMarkup(); for (int nodeIdx = 0; nodeIdx < docNodes.getLength(); ++nodeIdx) { docs.append( docNodes .item(nodeIdx) .getTextContent() .replaceAll("\\s+", " ")); } break; } } return docs.toString(); } return null; } }